/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vsqrtf_ultra3.S"

#include "libm.h"
/*
 * Publish __vsqrtf as a weak function symbol whose value is the
 * address of __vsqrtf_ultra3, the routine defined in this file.
 */
	.weak	__vsqrtf
	.type	__vsqrtf,#function
	__vsqrtf = __vsqrtf_ultra3

	/*
	 * Constant table.  Each entry is one 64-bit value written as two
	 * 32-bit words, most significant word first.  The function entry
	 * code loads the entries with ldd at byte offsets 0, 8, 16, 24 and
	 * 32 from .CONST_TBL, so the order of the entries must not change.
	 *
	 * K1 and K2 are the polynomial coefficients and DC0/DC1/DC2 are the
	 * bit masks / constant named in the algorithm description further
	 * down in this file.
	 * NOTE(review): RO_DATA is a macro that is not defined in this
	 * file; presumably it selects a read-only data section -- confirm
	 * in libm.h.
	 */
	RO_DATA
	.align	64

.CONST_TBL:
	.word	0x3fe00001, 0x80007e00	! K1  =  5.00000715259318464227e-01
	.word	0xbfc00003, 0xc0017a01	! K2  = -1.25000447037521686593e-01
	.word	0x000fffff, 0xffffffff	! DC0 = 0x000fffffffffffff
	.word	0x3ff00000, 0x00000000	! DC1 = 0x3ff0000000000000
	.word	0x7ffff000, 0x00000000	! DC2 = 0x7ffff00000000000

/*
 * Floating-point registers that hold the five .CONST_TBL entries for
 * the whole run of the function (loaded once at entry).
 */
#define DC0		%f6
#define DC1		%f4
#define DC2		%f2
#define K2		%f38
#define K1		%f36
/*
 * Integer registers with a fixed role:
 *   TBL          address of __vlibm_TBL_sqrtf
 *   stridex      input stride in bytes  (argument scaled by 4)
 *   stridey      output stride in bytes (argument scaled by 4)
 *   _0x1ff0      the mask 0x1ff0 used to form the table offset si0
 *   counter      number of elements left in the current run
 *   _0x00800000  lower bound for the fast path (compared against ax)
 *   _0x7f800000  upper bound for the fast path (compared against ax)
 */
#define TBL		%l2
#define stridex		%l3
#define stridey		%l4
#define _0x1ff0		%l5
#define counter		%l6
#define _0x00800000	%l7
#define _0x7f800000	%o0

/*
 * Scratch slots, addressed relative to %fp:
 *   tmp_px       input pointer to resume from (written/read with stx/ldx)
 *   tmp_counter  elements still to do after the current run (st/ld)
 *   tmp0..tmp4   lexp0 of the five pipeline slots; written as a 64-bit
 *                integer with stx and read back into an FP register
 *                with ldd
 */
#define tmp_px		STACK_BIAS-0x40
#define tmp_counter	STACK_BIAS-0x38
#define tmp0		STACK_BIAS-0x30
#define tmp1		STACK_BIAS-0x28
#define tmp2		STACK_BIAS-0x20
#define tmp3		STACK_BIAS-0x18
#define tmp4		STACK_BIAS-0x10

! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x40

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!
!  x0 = *px;
!  ax = *(int*)px;
!  px += stridex;
!
!  if( ax >= 0x7f800000 )
!  {
!    *py = sqrtf(x0);
!    py += stridey;
!    continue;
!  }
!  if( ax < 0x00800000 )
!  {
!    *py = sqrtf(x0);
!    py += stridey;
!    continue;
!  }
!
!  db0 = (double)x0;
!  iexp0 = ax >> 24;
!  iexp0 += 0x3c0;
!  lexp0 = (long long)iexp0 << 52;
!
!  db0 = vis_fand(db0,DC0);
!  db0 = vis_for(db0,DC1);
!  hi0 = vis_fand(db0,DC2);
!
!  ax >>= 11;
!  si0 = ax & 0x1ff0;
!  dtmp0 = ((double*)((char*)TBL + si0))[0];
!  xx0 = (db0 - hi0);
!  xx0 *= dtmp0;
!  dtmp0 = ((double*)((char*)TBL + si0))[1];
!  res0 = K2 * xx0;
!  res0 += K1;
!  res0 *= xx0;
!  res0 += DC1;
!  res0 = dtmp0 * res0;
!  dtmp1 = *((double*)&lexp0);
!  res0 *= dtmp1;
!  fres0 = (float)res0;
!  *py = fres0;
!  py += stridey;
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

	/*
	 * __vsqrtf_ultra3 -- vector single-precision square root.
	 *
	 * Arguments as they are consumed below.  The C prototype is not
	 * visible in this file; presumably it is
	 *   void __vsqrtf(int n, float *x, int stridex, float *y, int stridey)
	 * -- TODO confirm against the libmvec headers.
	 *   %i0  element count, stored and reloaded as a 32-bit value
	 *   %i1  px, address of the first input element
	 *   %i2  input stride in elements  (shifted left by 2 -> bytes)
	 *   %i3  py, address of the first output element (kept in %g5)
	 *   %i4  output stride in elements (shifted left by 2 -> bytes)
	 *
	 * The count and px are parked in tmp_counter / tmp_px because
	 * .begin reloads both from the stack every time a run starts.
	 * The frame is MINFRAME plus tmps bytes of scratch space.
	 *
	 * %l7 serves as the PIC scratch register for the two table
	 * lookups and is only afterwards loaded with 0x00800000.
	 * The mask 0x1ff0 is built as 0xff8 << 1 (0x1ff0 is larger than
	 * a 13-bit signed immediate can hold).
	 */
	ENTRY(__vsqrtf_ultra3)
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o2)
	PIC_SET(l7,__vlibm_TBL_sqrtf,l2)

	st	%i0,[%fp+tmp_counter]
	sll	%i2,2,stridex
	or	%g0,0xff8,%l5

	stx	%i1,[%fp+tmp_px]
	sll	%l5,1,_0x1ff0

	ldd	[%o2],K1
	sll	%i4,2,stridey

	ldd	[%o2+8],K2
	or	%g0,%i3,%g5

	ldd	[%o2+16],DC0
	sethi	%hi(0x7f800000),%o0

	ldd	[%o2+24],DC1
	sethi	%hi(0x00800000),%l7

	ldd	[%o2+32],DC2

/*
 * .begin / .begin1 -- start (or restart) a run and fill the pipeline.
 *
 * .begin reloads the element count and the input pointer from
 * tmp_counter / tmp_px and then clears tmp_counter.  An .updateN
 * handler stores a non-zero remainder there when it cuts a run short,
 * so a zero count on a later pass through .begin leads to .exit.
 * .begin1 is the re-entry point used by .spec, which keeps counter and
 * %i1 live in registers.  %g5 holds the current output pointer at both
 * entry points.
 *
 * The first element of a run is range-checked here; if its raw bits
 * are outside [0x00800000, 0x7f800000) (signed compare) it goes to
 * .spec.  The remainder of this block starts work on the following
 * elements without storing any result: stores only happen in
 * .main_loop and .tail.  It mirrors the second half of .main_loop
 * instruction for instruction, minus the fdtos/st steps.
 *
 * Every later element is loaded ahead of time and range-checked too;
 * a failing check branches to the matching .updateN handler, which
 * returns to the .contN label that follows the branch.
 *
 * All input loads use lda with ASI 0x82, which SPARC V9 defines as
 * ASI_PRIMARY_NOFAULT; presumably this is what makes it safe to read
 * ahead of the counted elements -- confirm against the V9 manual.
 *
 * The (a_b) tags in the trailing comments identify the pipeline slot
 * an instruction belongs to.
 * NOTE(review): the exact meaning of the two digits is inferred from
 * usage only.
 */
.begin:
	ld	[%fp+tmp_counter],counter
	ldx	[%fp+tmp_px],%i1
	st	%g0,[%fp+tmp_counter]
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit

	/* delay slot of the branch above: executed even when leaving */
	lda	[%i1]0x82,%o2		! (2_0) ax = *(int*)px;

	or	%g0,%i1,%o7
	lda	[%i1]0x82,%f25		! (2_0) x0 = *px;

	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.spec		! (2_0) if( ax >= 0x7f800000 )
	nop

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.spec		! (2_0) if( ax < 0x00800000 )
	nop

	fstod	%f25,%f56		! (2_0) db0 = (double)x0;

	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;

	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update0		! (3_0) if( ax >= 0x7f800000 )
	nop
.cont0:
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;

	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update1		! (3_0) if( ax < 0x00800000 )
	nop
.cont1:
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;

	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	add	%o4,960,%i0		! (3_0) iexp0 += 0x3c0;

	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update2		! (4_1) if( ax >= 0x7f800000 )
	nop
.cont2:
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
	sllx	%i0,52,%g1		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g1,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update3		! (4_1) if( ax < 0x00800000 )
	nop
.cont3:
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;

	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update4		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont4:
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update5		! (0_0) if( ax < 0x00800000 )
	nop
.cont5:
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;

	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	add	%o4,stridex,%i1		! px += stridex

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update6		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont6:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update7		! (1_0) if( ax < 0x00800000 )
	nop
.cont7:
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;

	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update8		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont8:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	/*
	 * From here on the return label of the "ax < 0x00800000" handler
	 * sits after the fstod, so the handler path skips the conversion
	 * of the rejected element.
	 */
	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update9		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont9:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update10		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont10:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update11		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont11:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	/*
	 * The output pointer moves from %g5 to %i3 because %g5 is reused
	 * just below for iexp0 of slot 3.  With fewer than 5 elements in
	 * this run the steady-state loop is skipped and the results in
	 * flight are drained by .tail; otherwise 5 is subtracted up front
	 * for the first pass of the loop.
	 */
	or	%g0,%g5,%i3
	cmp	counter,5
	bl,pn	%icc,.tail
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	ba	.main_loop
	sub	counter,5,counter	! counter

	/*
	 * .main_loop -- steady state of the software pipeline.
	 *
	 * One pass converts and stores five results and issues the loads
	 * for five new elements.  The output pointer enters in %i3; the
	 * five stores go to %i3, %o3, %o3+stridey, %g5 and %g5+stridey,
	 * and %i3 is advanced by five strides for the next pass.
	 *
	 * %g5 is shared: on entry it holds iexp0 of slot 3 (set in the
	 * delay slot of the branch that reaches this label), it is
	 * shifted and spilled to tmp1 first, and only then reused as an
	 * output pointer.
	 *
	 * counter is kept 5 lower than the number of results still to be
	 * stored, so "subcc counter,5 / bpos" continues only while a
	 * complete further pass is available.  On fall-through the 5 is
	 * added back so that .tail sees the true remainder.
	 *
	 * Every newly loaded ax is compared against 0x7f800000 and
	 * 0x00800000; a failing compare branches to an .updateN handler,
	 * which returns to the .contN label following the branch.
	 */
	.align	16
.main_loop:
	fmuld	K2,%f30,%f60		! (1_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update12		! (4_1) if( ax >= 0x7f800000 )
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
.cont12:
	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	sllx	%g5,52,%g5		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;
	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g5,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update13		! (4_1) if( ax < 0x00800000 )
	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;
.cont13:
	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;
	faddd	%f60,K1,%f32		! (1_1) res0 += K1;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update14		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont14:
	fmuld	%f32,%f30,%f48		! (1_1) res0 *= xx0;
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update15		! (0_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;
.cont15:
	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	add	%o3,stridey,%g5		! py += stridey
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	st	%f19,[%o3]		! (3_2) *py = fres0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;
	add	%o4,stridex,%i1		! px += stridex
	ldd	[%i4+8],%f60		! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f48,DC1,%f58		! (1_1) res0 += DC1;

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update16		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont16:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	fmuld	%f60,%f58,%f44		! (1_1) res0 = dtmp0 * res0;
	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update17		! (1_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp4],%f34		! (1_1) dtmp1 = *((double*)&lexp0);
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;
.cont17:
	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f44,%f34,%f44		! (1_1) res0 *= dtmp1;
	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update18		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont18:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update19		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont19:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	add	%g5,stridey,%g1		! py += stridey
	st	%f27,[%g5]		! (0_1) *py = fres0;
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update20		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont20:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f8		! (1_1) fres0 = (float)res0;

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update21		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont21:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	st	%f8,[stridey+%g5]	! (1_1) *py = fres0;
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	/*
	 * %i3 was used as a table pointer up to the ldd above; from here
	 * it is the output pointer again (five strides past its value at
	 * the top of this pass).
	 */
	add	%g1,stridey,%i3		! py += stridey
	subcc	counter,5,counter	! counter
	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	bpos,pt	%icc,.main_loop
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	/* loop finished: restore the true number of pending results */
	add	counter,5,counter
/*
 * .tail -- drain the pipeline at the end of a run.
 *
 * counter holds the number of already started results that still have
 * to be stored; this code finishes and stores at most four of them.
 * Before each store counter is decremented, and as soon as it becomes
 * negative control returns to .begin.  %g5 must hold the address of the
 * next output element when .begin is reached:
 *   - the first two exits set it in the annulled delay slot of the
 *     branch (executed only when the branch is taken),
 *   - the last two exits rely on %g5 having been advanced already,
 *   - the final fall-through advances it in the delay slot of ba.
 * The arithmetic is the matching subset of the .main_loop body.
 */
.tail:
	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%i3,%g5

	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;

	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);

	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%o3,%g5

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;

	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);

	add	%o3,stridey,%g5		! py += stridey

	st	%f19,[%o3]		! (3_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;

	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	st	%f27,[%g5]		! (0_1) *py = fres0;

	ba	.begin
	add	%g5,stridey,%g5

	/*
	 * .spec -- slow path for the first element of a run.
	 *
	 * Reached from .begin1 when the raw bits of the element, compared
	 * as a signed integer, are >= 0x7f800000 or < 0x00800000.  The
	 * element (still in %f25) is computed directly with fsqrts and
	 * stored through %g5; then the element is counted, both pointers
	 * are advanced by one stride (the output pointer in the delay
	 * slot of the branch), and processing resumes at .begin1 with the
	 * following element.
	 */
	.align	16
.spec:
	fsqrts	%f25,%f25
	sub	counter,1,counter
	add	%i1,stridex,%i1
	st	%f25,[%g5]
	ba	.begin1
	add	%g5,stridey,%g5

	/*
	 * .update0 - .update11 -- out-of-range handlers for the pipeline
	 * fill code between .begin1 and .main_loop.
	 *
	 * Even-numbered handlers are entered when a pre-loaded ax is
	 * >= 0x7f800000, odd-numbered ones when it is < 0x00800000.
	 * Every handler follows the same template, where K is the
	 * constant compared against counter (the position of the
	 * offending element within the current run):
	 *
	 *   1. The float register holding that element is zeroed with
	 *      fzeros.  This sits in the delay slot of ble and therefore
	 *      happens on both paths.
	 *   2. If counter <= K the element lies beyond the elements
	 *      counted for this run; nothing else is done and control
	 *      returns to .contN.
	 *   3. Otherwise the address of the element is saved in tmp_px,
	 *      counter - K is saved in tmp_counter, and counter is set to
	 *      K, so the current run ends immediately before the element.
	 *      When .tail later returns to .begin the element becomes the
	 *      first one of the next run, where .spec handles it.
	 *   4. The integer copy of ax is overwritten (0x7f800000 in the
	 *      even handlers, 0 in the odd ones).  Presumably this keeps
	 *      the remaining integer work done for that pipeline slot on
	 *      a benign value; with 0x7f800000 the following
	 *      "ax < 0x00800000" test of the same element is not taken.
	 *
	 * The pointer register saved in step 3 differs per handler
	 * because the fill code keeps the address of the newest element
	 * in %i1, %o4 or %o7 depending on the stage.
	 */
	.align	16
.update0:
	cmp	counter,1
	ble	.cont0
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont0
	or	%g0,1,counter

	.align	16
.update1:
	cmp	counter,1
	ble	.cont1
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont1
	or	%g0,1,counter

	.align	16
.update2:
	cmp	counter,2
	ble	.cont2
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont2
	or	%g0,2,counter

	.align	16
.update3:
	cmp	counter,2
	ble	.cont3
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont3
	or	%g0,2,counter

	.align	16
.update4:
	cmp	counter,3
	ble	.cont4
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont4
	or	%g0,3,counter

	.align	16
.update5:
	cmp	counter,3
	ble	.cont5
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont5
	or	%g0,3,counter

	.align	16
.update6:
	cmp	counter,4
	ble	.cont6
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont6
	or	%g0,4,counter

	.align	16
.update7:
	cmp	counter,4
	ble	.cont7
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont7
	or	%g0,4,counter

	.align	16
.update8:
	cmp	counter,5
	ble	.cont8
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont8
	or	%g0,5,counter

	.align	16
.update9:
	cmp	counter,5
	ble	.cont9
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont9
	or	%g0,5,counter

	.align	16
.update10:
	cmp	counter,6
	ble	.cont10
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont10
	or	%g0,6,counter

	.align	16
.update11:
	cmp	counter,6
	ble	.cont11
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont11
	or	%g0,6,counter

	/*
	 * .update12 - .update21 -- out-of-range handlers for .main_loop.
	 *
	 * Even-numbered handlers are entered when a pre-loaded ax is
	 * >= 0x7f800000, odd-numbered ones when it is < 0x00800000.
	 * Inside the loop counter is kept 5 lower than the number of
	 * results still to be stored, so the constant K compared against
	 * counter is the position of the offending element relative to
	 * the end of the current pass (2 .. 6).  Each handler:
	 *
	 *   1. zeroes the float register holding the element (fzeros in
	 *      the delay slot of ble, executed on both paths);
	 *   2. returns to .contN at once if counter <= K, because the
	 *      element then lies beyond the elements counted for this
	 *      run;
	 *   3. otherwise saves the address of the element in tmp_px and
	 *      counter - K in tmp_counter, and sets counter to K, so that
	 *      the loop exits after the current pass and .tail stores
	 *      only the results that precede the element; the element
	 *      becomes the first one of the next run started at .begin;
	 *   4. overwrites the integer copy of ax (0x7f800000 in the even
	 *      handlers, 0 in the odd ones).  Presumably this keeps the
	 *      remaining integer work done for that pipeline slot on a
	 *      benign value; with 0x7f800000 the following
	 *      "ax < 0x00800000" test of the same element is not taken.
	 *
	 * The odd-numbered handlers return to a label placed after the
	 * fstod of the element, so that conversion is skipped on the
	 * handler path.
	 */
	.align	16
.update12:
	cmp	counter,2
	ble	.cont12
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont12
	or	%g0,2,counter

	.align	16
.update13:
	cmp	counter,2
	ble	.cont13
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont13
	or	%g0,2,counter

	.align	16
.update14:
	cmp	counter,3
	ble	.cont14
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont14
	or	%g0,3,counter

	.align	16
.update15:
	cmp	counter,3
	ble	.cont15
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont15
	or	%g0,3,counter

	.align	16
.update16:
	cmp	counter,4
	ble	.cont16
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont16
	or	%g0,4,counter

	.align	16
.update17:
	cmp	counter,4
	ble	.cont17
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont17
	or	%g0,4,counter

	.align	16
.update18:
	cmp	counter,5
	ble	.cont18
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont18
	or	%g0,5,counter

	.align	16
.update19:
	cmp	counter,5
	ble	.cont19
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19
	or	%g0,5,counter

	.align	16
.update20:
	cmp	counter,6
	ble	.cont20
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont20
	or	%g0,6,counter

	.align	16
.update21:
	cmp	counter,6
	ble	.cont21
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont21
	or	%g0,6,counter

/*
 * .exit -- reached from .begin1 when no elements remain.  Returns to
 * the caller; the restore in the delay slot of ret undoes the save
 * executed at function entry.  No return value is set up.
 * NOTE(review): SET_SIZE is a macro that is not defined in this file;
 * presumably it records the size of the function symbol -- confirm.
 */
.exit:
	ret
	restore
	SET_SIZE(__vsqrtf_ultra3)

